import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# 设置 matplotlib 中文显示字体
plt.rcParams['font.family'] = 'Microsoft YaHei'
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
def draw_roc(y_true, y_score, title):
    """Plot the ROC curve (with AUC in the legend) for binary scores."""
    false_pos, true_pos, _ = roc_curve(y_true, y_score)
    area = roc_auc_score(y_true, y_score)
    plt.figure()
    plt.plot(false_pos, true_pos, label='ROC curve (area = %0.2f)' % area)
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()
混淆矩阵¶
# Plot the confusion matrix of predictions as an annotated heatmap.
def draw_confusion_matrix(y_true, y_pred, title):
    """Render the test-set confusion matrix with integer cell counts."""
    from sklearn.metrics import confusion_matrix
    import seaborn as sns
    import matplotlib.pyplot as plt
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure()
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
评估指标与分类报告¶
# Plot the per-class classification report as a heatmap.
def draw_classification_report(y_true, y_pred, title, target_names=None):
    """Draw precision/recall/F1 per class as an annotated heatmap.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        title: figure title.
        target_names: display names for the classes, in label order.
            Defaults to ['Benign', 'Malignant'] (previous hard-coded value),
            so existing callers are unaffected.
    """
    from sklearn.metrics import classification_report
    import seaborn as sns
    import matplotlib.pyplot as plt
    if target_names is None:
        target_names = ['Benign', 'Malignant']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    plt.figure()
    # Drop the 'support' row, then transpose so each class is one heatmap row.
    sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True)
    plt.title(title)
    plt.show()
# Plot overall evaluation metrics as a horizontal bar chart.
def draw_metrics(y_true, y_pred, title):
    """Show accuracy plus weighted precision/recall/F1 as horizontal bars."""
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    import matplotlib.pyplot as plt
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    labels = ['accuracy', 'precision', 'recall', 'f1-score']
    values = [acc, prec, rec, f1]
    fig, ax = plt.subplots(figsize=(6, 2))
    # Annotate each bar with its numeric value (text draws above the bars).
    for row, val in enumerate(values):
        ax.text(val, row, f'{val:.4f}', ha='right', va='center')
    ax.barh(labels, values, color='#02ABEC')
    plt.xlim(0, 1)
    plt.title(title)
    plt.show()
决策边界¶
# Plot the classifier's decision regions projected onto two chosen features.
def draw_decision_boundary(X, y, model, title, x_columns, display_features = ['texture_mean', 'concavity_mean']):
    # NOTE(review): mutable default list — harmless here since it is never mutated.
    from mlxtend.plotting import plot_decision_regions
    import matplotlib.pyplot as plt
    # Map the two display feature names to their column indices in X.
    feature_idx = [x_columns.index(feature) for feature in display_features]
    # All remaining feature indices must be pinned by mlxtend's filler values.
    filter_idx = [i for i in range(X.shape[1]) if i not in feature_idx]
    # Draw the 2-D decision regions; non-displayed features are held at 0
    # with a tolerance range of 3 (0 is the mean of standardized inputs —
    # TODO confirm callers always pass standardized data).
    plot_decision_regions(
        X, y, clf=model, legend=2,
        feature_index=feature_idx,
        filler_feature_values={i: 0 for i in filter_idx},
        filler_feature_ranges={i: 3 for i in filter_idx},
        # plt.subplots returns (fig, ax); only the axes object is needed.
        ax=plt.subplots(1, 1, figsize=(8, 5))[1]
    )
    plt.xlabel(display_features[0], size=10)
    plt.ylabel(display_features[1], size=10)
    plt.title(title, size=12)
    plt.show()
学习曲线¶
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training vs. cross-validation score against training-set size."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, tr_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    tr_mean = np.mean(tr_scores, axis=1)
    tr_std = np.std(tr_scores, axis=1)
    cv_mean = np.mean(cv_scores, axis=1)
    cv_std = np.std(cv_scores, axis=1)
    plt.grid()
    # Shade one standard deviation around each mean curve.
    plt.fill_between(sizes, tr_mean - tr_std, tr_mean + tr_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, tr_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
数据集¶
数据集使用 Kaggle - PRIYANKA/Breast Cancer Wisconsin ,
数据列: id, diagnosis, radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave points_worst, symmetry_worst, fractal_dimension_worst
diagnosis:诊断结果,M代表恶性(malignant),B代表良性(benign)。radius:细胞核的半径。texture:灰度值,反映了细胞核的纹理。perimeter:细胞核的周长。area:细胞核的面积。smoothness:半径长度的变化,反映了细胞核的平滑度。compactness:周长平方除以面积减去1,反映了细胞核的紧密度。concavity:轮廓的凹部的严重程度。concave points:轮廓凹部的数量。symmetry:细胞核对称性。fractal_dimension:细胞核的分形维数,反映了细胞核的复杂度。
# Load the Breast Cancer Wisconsin dataset from the local CSV file.
dataset = pd.read_csv("./data/breast cancer.csv")
# Hand-picked subset of 16 feature columns used for model training.
x_columns = ["radius_mean", "texture_mean", "smoothness_mean", "area_mean", "compactness_mean", "concavity_mean",
"concave points_mean", "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se",
"smoothness_se", "smoothness_worst", "radius_worst", "concavity_se"]
# Feature matrix and raw diagnosis labels ('M' malignant / 'B' benign).
X = dataset.loc[:, x_columns].values
y = dataset.loc[:, "diagnosis"].values
数据集特征分布¶
# Pairwise scatter/KDE plots of the selected features, colored by diagnosis
# (yellow = malignant 'M', blue = benign 'B').
sns.pairplot(dataset.loc[:, [*x_columns, "diagnosis"]],hue = 'diagnosis', palette = {'M': '#FFC000', 'B': '#02ABEC'})
<seaborn.axisgrid.PairGrid at 0x7c347aa65750>
# Full list of all 30 numeric feature columns in the dataset.
all_x_columns = ["radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean", "concave points_mean", "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se", "compactness_se", "concavity_se", "concave points_se", "symmetry_se", "fractal_dimension_se", "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst", "compactness_worst", "concavity_worst", "concave points_worst", "symmetry_worst", "fractal_dimension_worst"]
all_X = dataset.loc[:, all_x_columns].values
# Rebuild a DataFrame so seaborn can group each feature by diagnosis.
df = pd.DataFrame(all_X, columns=all_x_columns)
df['diagnosis'] = y
plt.figure(figsize=(18, 14))
plt.suptitle('特征分布 - 卢继鹏', position=(0.5, 0.90))
# One violin plot per feature, split by diagnosis, laid out on a 4x8 grid.
for i, col in enumerate(all_x_columns):
    plt.subplot(4, 8, i + 1)
    sns.violinplot(y='diagnosis', x=col, data=df, color='#02ABEC')
plt.show()
将数据集分割为训练集和测试集,其中80%的数据用于训练,20%的数据用于测试。
使用StandardScaler对数据进行标准化,使数据的平均值为0,标准差为1。
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Hold out 20% of the data as the test set; random_state fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Standardize features: fit statistics on the training set only, apply to both.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Encode class labels: benign 'B' -> 0, malignant 'M' -> 1.
y_train = np.where(y_train == 'M', 1, 0)
y_test = np.where(y_test == 'M', 1, 0)
import matplotlib.pyplot as plt
# Compare the raw vs. standardized distribution of a single feature (index 0).
feature_idx = 0
plt.figure(figsize=(5, 2))
plt.suptitle('数据归一化 - 卢继鹏', position=(0.5, 1.1))
plt.subplot(1, 2, 1)
plt.hist(X[:, feature_idx], bins=30, color='#02ABEC')
plt.title('Original Data')
plt.subplot(1, 2, 2)
# X_train has already been passed through StandardScaler at this point.
plt.hist(X_train[:, feature_idx], bins=30, color='#02ABEC')
plt.title('Standardized Data')
plt.show()
Sigmoid核SVM¶
构建模型¶
使用scikit-learn库中的支持向量机对数据进行训练的过程。
创建一个SVC类的实例,使用Sigmoid核函数,并设置随机状态种子为0,以确保结果的可重复性。
使用X_train和y_train训练SVM分类器。
from sklearn.svm import SVC
# Sigmoid-kernel SVM; probability=True enables predict_proba (Platt scaling).
classifier_sigmoid = SVC(kernel='sigmoid', random_state=0, probability=True)
classifier_sigmoid.fit(X_train, y_train)
SVC(kernel='sigmoid', probability=True, random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(kernel='sigmoid', probability=True, random_state=0)
模型决策边界¶
选择两个特征,使用plot_decision_regions绘制分类器的决策边界在这两个特征的投影。可以看到分类器在二维特征空间中的分类效果。
# Project the sigmoid-kernel SVM's decision boundary onto two features.
draw_decision_boundary(X_train, y_train, classifier_sigmoid, 'Sigmoid核 SVM 决策边界 - 卢继鹏', x_columns)
学习曲线¶
使用learning_curve绘制学习曲线,观察训练集和测试集的准确率随着训练样本数量的变化。
训练得分:表明了模型在训练集和测试集上的准确率,可以看到随着训练样本数量的增加,模型的准确率逐渐提高。但继续增加训练样本数量可能会减少训练得分,因为模型在更大的数据集上更难拟合。
交叉验证得分:表明了模型在交叉验证集上的准确率,可以看到随着训练样本数量的增加,模型的准确率逐渐提高,但在一定数量后准确率趋于稳定。
通过观察训练得分和验证得分的变化,我们可以判断模型是否过拟合或欠拟合。如果训练得分和验证得分之间的差距较大,可能是过拟合;如果两者都较低,可能是欠拟合。
# Learning curve with 5-fold cross-validation, then test-set predictions.
plot_learning_curve(classifier_sigmoid, "Sigmoid 核 SVM 学习曲线 - 卢继鹏", X_train, y_train, cv=5)
y_pred_sigmoid = classifier_sigmoid.predict(X_test)
使用sklearn.metrics库中的confusion_matrix函数,得到混淆矩阵,可以看到模型在测试集上的分类效果。
# Confusion matrix of the sigmoid-kernel SVM on the test set.
draw_confusion_matrix(y_test, y_pred_sigmoid, 'Sigmoid核 SVM 混淆矩阵 - 卢继鹏')
使用sklearn.metrics库中的accuracy_score和classification_report等函数,得到模型的准确率、精确率、召回率、F1值等评价指标。
# Bar chart of overall metrics and per-class classification report.
draw_metrics(y_test, y_pred_sigmoid, 'Sigmoid核 SVM 模型评估指标 - 卢继鹏')
draw_classification_report(y_test, y_pred_sigmoid, 'Sigmoid核 SVM 分类报告 - 卢继鹏')
Accuracy: 0.9123, Precision: 0.9122, Recall: 0.9123, F1-Score: 0.9120
{'Benign': {'precision': 0.9130434782608695, 'recall': 0.9402985074626866, 'f1-score': 0.9264705882352942, 'support': 67.0}, 'Malignant': {'precision': 0.9111111111111111, 'recall': 0.8723404255319149, 'f1-score': 0.8913043478260869, 'support': 47.0}, 'accuracy': 0.9122807017543859, 'macro avg': {'precision': 0.9120772946859903, 'recall': 0.9063194664973008, 'f1-score': 0.9088874680306905, 'support': 114.0}, 'weighted avg': {'precision': 0.9122468005763199, 'recall': 0.9122807017543859, 'f1-score': 0.9119722259613227, 'support': 114.0}}
使用sklearn.metrics库中的roc_curve函数,计算ROC曲线,使用matplotlib库绘制ROC曲线,评估模型的分类效果。
# Use continuous decision_function scores rather than hard 0/1 predictions:
# a ROC curve built from binary predictions collapses to a single operating point.
draw_roc(y_test, classifier_sigmoid.decision_function(X_test), 'Sigmoid核 SVM ROC曲线 - 卢继鹏')
# Linear-kernel SVM (no probability estimates requested).
classifier_linear =SVC(kernel='linear', random_state=0)
classifier_linear.fit(X_train, y_train)
SVC(kernel='linear', random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(kernel='linear', random_state=0)
# Decision boundary and 5-fold learning curve for the linear-kernel SVM.
draw_decision_boundary(X_train, y_train, classifier_linear, '线性核 SVM 决策边界 - 卢继鹏', x_columns)
plot_learning_curve(classifier_linear, "线性核 SVM 学习曲线 - 卢继鹏", X_train, y_train, cv=5)
模型评估¶
# Evaluate the linear-kernel SVM on the held-out test set.
y_pred_linear = classifier_linear.predict(X_test)
draw_confusion_matrix(y_test, y_pred_linear, '线性核SVM 混淆矩阵 - 卢继鹏')
draw_metrics(y_test, y_pred_linear, '线性核SVM 模型评估指标 - 卢继鹏')
draw_classification_report(y_test, y_pred_linear, '线性核SVM 分类报告 - 卢继鹏')
Accuracy: 0.9737, Precision: 0.9738, Recall: 0.9737, F1-Score: 0.9736
{'Benign': {'precision': 0.9705882352941176, 'recall': 0.9850746268656716, 'f1-score': 0.9777777777777777, 'support': 67.0}, 'Malignant': {'precision': 0.9782608695652174, 'recall': 0.9574468085106383, 'f1-score': 0.967741935483871, 'support': 47.0}, 'accuracy': 0.9736842105263158, 'macro avg': {'precision': 0.9744245524296675, 'recall': 0.9712607176881549, 'f1-score': 0.9727598566308244, 'support': 114.0}, 'weighted avg': {'precision': 0.9737515143357114, 'recall': 0.9736842105263158, 'f1-score': 0.9736401936741494, 'support': 114.0}}
# Use continuous decision_function scores rather than hard 0/1 predictions:
# a ROC curve built from binary predictions collapses to a single operating point.
draw_roc(y_test, classifier_linear.decision_function(X_test), '线性核SVM ROC曲线 - 卢继鹏')
# RBF (Gaussian)-kernel SVM.
classifier_rbf =SVC(kernel='rbf', random_state=0)
classifier_rbf.fit(X_train, y_train)
SVC(random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(random_state=0)
# Decision boundary and 5-fold learning curve for the RBF-kernel SVM.
draw_decision_boundary(X_train, y_train, classifier_rbf, '高斯核 SVM 决策边界 - 卢继鹏', x_columns)
plot_learning_curve(classifier_rbf, "高斯核 SVM 学习曲线 - 卢继鹏", X_train, y_train, cv=5)
模型评估¶
# Evaluate the RBF-kernel SVM on the held-out test set.
y_pred_rbf = classifier_rbf.predict(X_test)
draw_confusion_matrix(y_test, y_pred_rbf, '高斯核SVM 混淆矩阵 - 卢继鹏')
draw_metrics(y_test, y_pred_rbf, '高斯核SVM 模型评估指标 - 卢继鹏')
draw_classification_report(y_test, y_pred_rbf, '高斯核SVM 分类报告 - 卢继鹏')
Accuracy: 0.9649, Precision: 0.9653, Recall: 0.9649, F1-Score: 0.9648
{'Benign': {'precision': 0.9565217391304348, 'recall': 0.9850746268656716, 'f1-score': 0.9705882352941176, 'support': 67.0}, 'Malignant': {'precision': 0.9777777777777777, 'recall': 0.9361702127659575, 'f1-score': 0.9565217391304348, 'support': 47.0}, 'accuracy': 0.9649122807017544, 'macro avg': {'precision': 0.9671497584541062, 'recall': 0.9606224198158145, 'f1-score': 0.9635549872122762, 'support': 114.0}, 'weighted avg': {'precision': 0.9652851936604796, 'recall': 0.9649122807017544, 'f1-score': 0.9647888903845291, 'support': 114.0}}
# Use continuous decision_function scores rather than hard 0/1 predictions:
# a ROC curve built from binary predictions collapses to a single operating point.
draw_roc(y_test, classifier_rbf.decision_function(X_test), '高斯核SVM ROC曲线 - 卢继鹏')
三种模型的比较¶
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Compare ROC curves of the three kernels. Use continuous decision-function
# scores instead of hard 0/1 predictions: with binary predictions each ROC
# "curve" degenerates to a single operating point and understates AUC.
score_sigmoid = classifier_sigmoid.decision_function(X_test)
score_linear = classifier_linear.decision_function(X_test)
score_rbf = classifier_rbf.decision_function(X_test)
fpr1, tpr1, thresholds1 = roc_curve(y_test, score_sigmoid)
auc1 = roc_auc_score(y_test, score_sigmoid)
fpr2, tpr2, thresholds2 = roc_curve(y_test, score_linear)
auc2 = roc_auc_score(y_test, score_linear)
fpr3, tpr3, thresholds3 = roc_curve(y_test, score_rbf)
auc3 = roc_auc_score(y_test, score_rbf)
# Plot all three ROC curves on one figure with the chance diagonal.
plt.figure()
plt.plot(fpr1, tpr1, label='Sigmoid 核SVM ROC curve (area = %0.2f)' % auc1)
plt.plot(fpr2, tpr2, label='线性核SVM ROC curve (area = %0.2f)' % auc2)
plt.plot(fpr3, tpr3, label='高斯核SVM ROC curve (area = %0.2f)' % auc3)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('不同核函数SVM ROC曲线对比 - 卢继鹏')
plt.legend(loc="lower right")
plt.show()
def plot_combined_learning_curve(classifiers, titles, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Overlay learning curves for several classifiers on one figure.

    Args:
        classifiers: estimators to evaluate with sklearn's learning_curve.
        titles: display name per classifier (paired positionally).
        X, y: training data and labels.
        cv, n_jobs: forwarded to learning_curve.
        train_sizes: relative or absolute training-set sizes to sample.
    """
    plt.figure(figsize=(10, 7))
    colors = ['#FF0000', '#FFC000', '#02ABEC' ]
    colors_secondaries = ['#FF4444', '#FFD066', '#40AAEC']
    for i, (clf, title) in enumerate(zip(classifiers, titles)):
        # Keep the returned absolute sizes in a separate name: the original
        # code rebound `train_sizes`, silently changing the argument passed
        # to learning_curve on every subsequent iteration.
        abs_sizes, train_scores, test_scores = learning_curve(
            clf, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        # Cycle through the palettes so >3 classifiers no longer raise IndexError.
        color = colors[i % len(colors)]
        color2 = colors_secondaries[i % len(colors_secondaries)]
        plt.plot(abs_sizes, train_scores_mean, 'o-', label=f"{title} - Training score", color=color)
        plt.plot(abs_sizes, test_scores_mean, 'x-', label=f"{title} - Cross-validation score", color=color2)
        # Shade one standard deviation around the training curve only; the
        # CV band is omitted to keep the comparison figure readable.
        plt.fill_between(abs_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color=color)
    plt.title("不同核函数SVM 学习曲线对比 - 卢继鹏")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid()
    plt.show()
# Overlay the learning curves of the three trained kernels for comparison.
classifiers = [classifier_sigmoid, classifier_linear, classifier_rbf]
titles = ["Sigmoid核 SVM", "线性核 SVM", "高斯核 SVM"]
plot_combined_learning_curve(classifiers, titles, X_train, y_train, cv=5)